# AMAR ET AL. - COUNTERING MISINFORMATION EARLY (2025)
## REPLICATION FILE: 18_census_balance_test.R
### This script creates a balance table for treatment and control villages
### and a comparison table for sample vs. non-sample villages.
# ----
# Load census data ----

# Census extract used for the treatment/control comparison. read_csv() and
# the pipe are expected to be attached before this script runs.
shrug_bimli <- read_csv("./data/raw/census/shrug_bimli_final.csv")


# Comparison between treatment and control ----

# Divide the SC, ST and literate population columns by total population so
# they enter the table as shares, and store the treatment flag as numeric.
shrug_bimli_table <- shrug_bimli %>%
  mutate(
    across(
      c(pc11_pca_p_sc, pc11_pca_p_st, pc11_pca_p_lit),
      ~ .x / pc11_pca_tot_p
    ),
    treat = as.numeric(treat)
  )


# Row labels for the output tables, keyed by census column name.
# Only ever looked up by name (var_labels[var]); the order below is kept as-is.
var_labels <- c(
  # Population composition
  "pc11_pca_tot_p"    = "Total population",
  "pc11_pca_p_sc"     = "Share SC population",
  "pc11_pca_p_st"     = "Share ST population",
  "pc11_pca_p_lit"    = "Share literate",
  # Village characteristics
  "pc11_vd_area"      = "Total area",
  "pc11_vd_s_sch"     = "No. of secondary schools",
  "pc11_vd_tar_road"  = "Pucca road",
  "pc11_pca_no_hh"    = "No. of households",
  "pc11_vd_p_sch"     = "No. of primary Schools",
  "pc11_vd_m_sch"     = "No. of middle Schools",
  # Electricity
  "pc11_vd_power_dom" = "Power supply for domestic use",
  "pc11_vd_power_agr" = "Power supply for agricultural use"
)


# Covariates reported in the tables, in row order.
table_vars <- c(
  "pc11_pca_no_hh",
  "pc11_pca_tot_p",
  "pc11_pca_p_sc",
  "pc11_pca_p_st",
  "pc11_pca_p_lit",
  "pc11_vd_area",
  "pc11_vd_p_sch",
  "pc11_vd_m_sch",
  "pc11_vd_s_sch",
  "pc11_vd_tar_road",
  "pc11_vd_power_dom",
  "pc11_vd_power_agr"
)


# Function to add significance stars
#
# Maps p-values to star strings: "***" for p < 0.01, "**" for p < 0.05,
# "*" for p < 0.1 and "" otherwise.
#
# Arguments:
#   p_val  Numeric vector of p-values (a single value in the usual case).
# Returns:
#   Character vector of the same length as p_val. Missing p-values get ""
#   instead of raising "missing value where TRUE/FALSE needed".
add_stars <- function(p_val) {
  stars <- rep("", length(p_val))
  # which() drops NA comparisons, so missing p-values keep the empty string.
  # Later assignments overwrite earlier ones, so the strictest threshold wins.
  stars[which(p_val < 0.1)] <- "*"
  stars[which(p_val < 0.05)] <- "**"
  stars[which(p_val < 0.01)] <- "***"
  stars
}


# Create the balance table
#
# One row per covariate in table_vars:
#   n                        observations used by the regression
#   mean_treat/mean_control  group means (treat == 1 / treat == 0), NAs dropped
#   diff, se, p_value        coefficient on `treat`, its standard error and
#                            p-value from lm(covariate ~ treat)
#   stars                    significance stars from the unrounded p-value
# Rows are built in a list and bound once instead of growing a data frame
# with rbind() inside a loop.
balance_results <- do.call(rbind, lapply(table_vars, function(var) {
  values <- shrug_bimli_table[[var]]
  treat <- shrug_bimli_table[["treat"]]

  # which() skips rows where treat is NA, matching filter(treat == ...)
  mean_treat <- mean(values[which(treat == 1)], na.rm = TRUE)
  mean_control <- mean(values[which(treat == 0)], na.rm = TRUE)

  # Run t-test regression: covariate on the treatment indicator
  model <- lm(reformulate("treat", response = var), data = shrug_bimli_table)
  treat_row <- coef(summary(model))["treat", ]
  p_val <- treat_row[["Pr(>|t|)"]]

  # var_labels[var] is a named element, so the row name becomes the variable
  # name, as before.
  data.frame(
    variable = var,
    var_label = var_labels[var],
    n = nobs(model),
    mean_treat = round(mean_treat, 3),
    mean_control = round(mean_control, 3),
    diff = round(treat_row[["Estimate"]], 2),
    se = round(treat_row[["Std. Error"]], 2),
    p_value = round(p_val, 3),
    stars = add_stars(p_val),
    stringsAsFactors = FALSE
  )
}))


# Create LaTeX table
output_dir <- "./output/tables/"
if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}


file_path <- paste0(output_dir, "tab_treatment_control_villages.tex")


# One formatted line per covariate. sprintf() is vectorised over the columns,
# which also yields zero lines (rather than failing) for an empty table.
balance_tex_rows <- sprintf(
  "%s & %d & %.3f & %.3f & %.2f%s & %.2f & %.3f \\\\",
  balance_results$var_label, balance_results$n,
  balance_results$mean_treat, balance_results$mean_control,
  balance_results$diff, balance_results$stars,
  balance_results$se, balance_results$p_value
)


# Write header, data rows and footer in a single call (overwrites the file).
# The legend states the thresholds add_stars() actually applies
# (* p < 0.1, ** p < 0.05, *** p < 0.01); it previously advertised
# 0.05 / 0.01 / 0.001, which did not match the stars printed in the table.
cat(
  paste0(
    c(
      "\\begin{table}[htpb!]",
      "\\centering",
      "\\caption{Balance Between Media Literacy and Control Villages}",
      "\\label{tab:balance}",
      "\\begin{tabular}{lcccccc} \\hline",
      " & \\textbf{N} & \\textbf{Treatment} & \\textbf{Control} & \\textbf{Diff.} & \\textbf{SE} & \\textbf{p-value} \\\\ \\hline",
      balance_tex_rows,
      "\\hline",
      "*p$<$0.1; **p$<$0.05; ***p$<$0.01 \\\\",
      "\\end{tabular}",
      "\\end{table}"
    ),
    "\n"
  ),
  file = file_path, sep = ""
)


# Display results
print("Balance table created successfully!")
print(balance_results)


# Optional: Display a summary of the balance table
# Count significance from the stars column rather than from p_value: stars
# are assigned from the unrounded p-value, whereas p_value has been rounded to
# three decimals, so e.g. p = 0.0497 is stored as 0.05 and would be missed by
# `p_value < 0.05`. %in% also never returns NA.
n_signif_05 <- sum(balance_results$stars %in% c("**", "***"))
cat("\nBalance Table Summary:\n")
cat("Number of variables tested:", nrow(balance_results), "\n")
cat("Significant differences (p < 0.05):", n_signif_05, "\n")
cat("LaTeX table saved to:", file_path, "\n")


# Comparison between sample and non-sample villages ----
shrug_bihar <- read_csv("./data/raw/census/shrug_bihar.csv")

# NOTE(review): unlike shrug_bimli_table, the SC / ST / literate columns are
# not divided by total population here, yet their rows are labelled
# "Share ...". Confirm that shrug_bihar.csv already stores shares.

# Create the comparison table
#
# One row per covariate in table_vars with the mean, standard deviation and
# non-missing count for sample (bimli == 1) and non-sample (bimli == 0)
# villages. diff / se / p_value / stars are placeholders (NA / "") so the
# columns mirror the treatment-control table. Rows are built in a list and
# bound once instead of growing a data frame with rbind() inside a loop.
sample_comparison_results <- do.call(rbind, lapply(table_vars, function(var) {
  values <- shrug_bihar[[var]]
  # which() skips rows where bimli is NA
  in_sample <- values[which(shrug_bihar$bimli == 1)]
  out_sample <- values[which(shrug_bihar$bimli == 0)]

  data.frame(
    variable = var,
    var_label = var_labels[var],
    mean_sample = round(mean(in_sample, na.rm = TRUE), 3),
    sd_sample = round(sd(in_sample, na.rm = TRUE), 3),
    mean_nonsample = round(mean(out_sample, na.rm = TRUE), 3),
    sd_nonsample = round(sd(out_sample, na.rm = TRUE), 3),
    n_sample = sum(!is.na(in_sample)),
    n_nonsample = sum(!is.na(out_sample)),
    diff = NA,
    se = NA,
    p_value = NA,
    stars = "",
    stringsAsFactors = FALSE
  )
}))

# Create second LaTeX table
file_path_2 <- paste0(output_dir, "tab_bimli_external.tex")

# One formatted line per covariate (means only). sprintf() is vectorised over
# the columns, so an empty table gives zero lines instead of the bogus
# iterations that 1:nrow() would produce.
external_tex_rows <- sprintf(
  "%s & %.3f & %.3f \\\\",
  sample_comparison_results$var_label,
  sample_comparison_results$mean_sample,
  sample_comparison_results$mean_nonsample
)

# N row: the non-missing counts of the first variable are used as the
# reference group sizes.
total_n_sample <- sample_comparison_results$n_sample[1]
total_n_nonsample <- sample_comparison_results$n_nonsample[1]

# Write header, data rows, N row and footer in a single call (overwrites the
# file).
cat(
  paste0(
    c(
      "\\begin{table}[htpb!]",
      "\\centering",
      "\\caption{Comparison between sample and non-sample villages}",
      "\\label{tab:bimli_external}",
      "\\begin{tabular}{lcc} \\hline",
      " & \\textbf{Sample} & \\textbf{Non-Sample} \\\\ \\hline",
      external_tex_rows,
      "\\hline",
      sprintf("N & %d & %d \\\\", total_n_sample, total_n_nonsample),
      "\\hline",
      "\\end{tabular}",
      "\\end{table}"
    ),
    "\n"
  ),
  file = file_path_2, sep = ""
)

# Console report for the sample vs non-sample table: a banner made of 60 "="
# characters, the full results data frame, then a short summary.
cat(c("\n", rep("=", 60), "\n"), sep = "")
cat("SECOND TABLE: Sample vs Non-Sample Comparison\n")
cat(c(rep("=", 60), "\n"), sep = "")

print(sample_comparison_results)

cat("\nSample vs Non-Sample Table Summary:\n")
cat(paste("Number of variables tested:", nrow(sample_comparison_results), "\n"))
cat(paste("LaTeX table saved to:", file_path_2, "\n"))

# END of 18_census_balance_test.R ----
